با استفاده از بسته gutenberg داده های لازم را به دست آورید و به سوالات زیر پاسخ دهید.


۱. چارلز دیکنز نویسنده معروف انگلیسی بالغ بر چهارده رمان (چهارده و نیم) نوشته است. متن تمامی کتاب های او را دانلود کنید و سپس بیست لغت برتر استفاده شده را به صورت یک نمودار ستونی نمایش دهید. (طبیعتا باید ابتدا متن را پاکسازی کرده و stopping words را حذف نمایید تا به کلماتی که بار معنایی مشخصی منتقل می کنند برسید.)

# Texts of Dickens's fifteen novels, one data frame per novel
# (presumably downloaded earlier via gutenbergr — each element is expected
# to carry gutenberg_id and text columns; confirm against the download code).
# Fixed: use `<-` for top-level assignment instead of `=`.
books <- list(BarnabyRudge, BleakHouse, DavidCopperfield, DombeyandSon, GreatExpectations,
              HardTimes, LittleDorrit, MartinChuzzlewit, NicholasNickleby, OliverTwist, OurMutualFriend,
              TheMysteryofEdwinDrood, TheOldCuriosityShop, ThePickwickPapers, ATaleofTwoCities)

# Per-book cleaned word-frequency tables, indexed by Gutenberg id.
books_text <- list()
# Pooled raw word vector across all books (used later for the combined top-20).
books_words_vec <- c()
for (book in books) {
  # Replace punctuation (except apostrophes, protected by the lookahead)
  # with spaces, then trim.
  str_replace_all(book$text,"(?!')[[:punct:]]", " ") %>% str_trim() -> book_punct_free
  
  # Tokenize on runs of whitespace.
  book_punct_free %>%
    str_split("\\s+") %>%
    unlist() -> book_words_vector
    # NOTE(review): growing a vector with c() inside a loop is O(n^2);
    # collecting per-book vectors in a list and unlist()-ing once would scale better.
    books_words_vec <- c(books_words_vec, book_words_vector)
  # Drop stop words (case-insensitive match), then strip the remaining
  # punctuation (including the apostrophes kept above) and re-trim.
  book_words_vector[!(str_to_lower(book_words_vector) %in% stop_words$word)] %>% 
    str_replace_all("[[:punct:]]", " ") %>% str_trim() -> book_punct_free_2
  
  # Re-tokenize and count word frequencies. The magrittr pipe names the
  # table dimension ".", which is why the column is selected as `.` below.
  book_punct_free_2 %>%
    str_split("\\s+") %>%
    unlist() %>%
    table() %>%
    as.data.frame() -> book_words_df
  book$gutenberg_id[1] -> id
  
  # Keep words of length >= 3 that are not stop words and contain no digits,
  # sorted by descending frequency; store under the book's Gutenberg id.
  # NOTE(review): a numeric id pads books_text with NULLs up to that index —
  # later loops index by the same ids, so this is consistent, just wasteful.
  dplyr::select(book_words_df, word = `.`, freq = Freq)%>%
    filter(str_length(word) >= 3) %>% 
    filter(!str_to_lower(word) %in% stop_words$word) %>% 
    filter(!str_detect(word, "\\d")) %>% 
    mutate(word = as.character(word)) %>% 
    arrange(desc(freq))-> books_text[[id]]
}


# Combined analysis over the pooled word vector from all novels:
# drop stop words (case-insensitive), strip remaining punctuation, re-trim.
books_words_vec[!(str_to_lower(books_words_vec) %in% stop_words$word)] %>% 
  str_replace_all("[[:punct:]]", " ") %>% str_trim() -> book_punct_free_2

# Re-tokenize and count frequencies; the pipe names the table dimension ".".
book_punct_free_2 %>%
  str_split("\\s+") %>%
  unlist() %>%
  table() %>%
  as.data.frame() -> book_words_df

# Keep words of length >= 3, non-stop-word, digit-free; sort by frequency.
# The second stop-word filter catches tokens created by the punctuation
# replacement above (e.g. "don't" -> "don" "t").
dplyr::select(book_words_df, word = `.`, freq = Freq)%>%
  filter(str_length(word) >= 3) %>% 
  filter(!str_to_lower(word) %in% stop_words$word) %>% 
  filter(!str_detect(word, "\\d")) %>% 
  arrange(desc(freq))-> all_books_text

# Top 20 most frequent words overall. The original round-tripped through
# as.matrix()/as.data.frame(), which coerced every column to character and
# then required converting freq back to numeric on a second line; selecting
# directly keeps freq numeric and word character from the start.
top_words <- all_books_text %>%
  top_n(n = 20, wt = freq) %>%   # explicit wt; ties may keep extra rows
  arrange(desc(freq)) %>%
  mutate(word = as.character(word))
# Horizontal bar chart of the 20 most frequent words, ordered by frequency.
ggplot(data = top_words,
       aes(
          x = reorder(word, freq),  # order bars by frequency
          y = freq,
          fill = freq
)) +
  geom_col() +             # identical to geom_bar(stat = "identity")
  guides(fill = "none") +  # fixed: was fill = F (reassignable shorthand)
  xlab("Word") +           # fixed: this axis shows words, not a count
  ylab("Frequency") +
  ggtitle("Words") +
  coord_flip()


۲. ابر لغات ۲۰۰ کلمه پرتکرار در رمان های چارلز دیکنز را رسم نمایید. این کار را با بسته wordcloud2 انجام دهید. برای دانلود می توانید به لینک زیر مراجعه کنید.

https://github.com/Lchiffon/wordcloud2

با استفاده از عکسی که در ابتدا متن آمده ابر لغاتی مانند شکل زیر رسم کنید. (راهنمایی: از ورودی figpath در دستور wordcloud2 استفاده نمایید.مثالی در زیر آورده شده است.)

# Word cloud of the 200 most frequent words, masked by the supplied picture.
# NOTE(review): figPath must be a path to an image file on disk — confirm
# what dickens_picture holds.
wordcloud2(all_books_text %>% top_n(n = 200), figPath = dickens_picture)
# Plain word cloud of the same 200 words, without the image mask.
wordcloud2(all_books_text %>% top_n(n = 200))

۳. اسم پنج شخصیت اصلی در هر رمان دیکنز را استخراج کنید و با نموداری تعداد دفعات تکرار شده بر حسب رمان را رسم نمایید. (مانند مثال کلاس در رسم اسامی شخصیت ها در سری هر پاتر)

# Extract the five most frequent character names per novel and chart them.
char_names <- data.frame()
for(id in book_ids){
  book <- books_text[[id]]
  # Words that START with a capital letter are treated as proper nouns.
  # Fixed: the original pattern "[A-Z ,.'-]" matched a capital letter (or a
  # stray apostrophe/space/comma) ANYWHERE in the word, so tokens like
  # "don't" slipped through; anchoring to the first character fixes that.
  top_guys <- book[str_which(book$word, "^[A-Z]"), ]
  # Drop bare honorifics, keep the 5 most frequent names (explicit wt),
  # and tag each row with the book id.
  new_char <- top_guys %>%
    filter(!word %in% c("Sir", "Lady", "Miss")) %>%
    top_n(n = 5, wt = freq) %>%
    mutate(book_id = id)
  new_char$book_name <- book_meta %>% filter(gutenberg_id == id) %>% .$title
  char_names <- rbind(char_names, new_char)
}
# Grouped column chart (highcharter): name repetitions per novel.
hchart(char_names,
       type= "column",
       hcaes(group= word,
             x = book_name, y = freq))

۴. در بسته tidytext داده ایی به نام sentiments وجود دارد که فضای احساسی لغات را مشخص می نماید. با استفاده از این داده نمودار ۲۰ لغت برتر negative و ۲۰ لغت برتر positive را در کنار هم رسم نمایید. با استفاده از این نمودار فضای حاکم بر داستان چگونه ارزیابی می کنید؟ (به طور مثال برای کتاب داستان دو شهر فضای احساسی داستان به ترتیب تکرر در نمودار زیر قابل مشاهده است.)

برای تشخیص مثبت یا منفی بودن فضای داستان از کلمات bing استفاده می کنیم. نتیجه ی این بررسی و بررسی نمودارها این است که فضای حاکم بر رمان های چارلز دیکنز فضای نسبتاً منفی ای است.

library(tidyr)
# Question 4: top-20 positive and top-20 negative words per novel (bing lexicon).
for(id in book_ids){
  book <- books_text[[id]]
  # Join word frequencies with bing sentiment labels; keep the 20 most
  # frequent words within each sentiment (ties may keep more).
  # NOTE(review): book$word keeps its original capitalization while bing
  # words are lowercase, so capitalized occurrences may be missed — confirm.
  bing_word_counts <- book %>%
    inner_join(get_sentiments("bing")) %>% group_by(sentiment) %>% 
    top_n(wt = freq, n = 20) %>% 
    ungroup()
  bing_word_counts
  book_name <- book_meta %>% filter(gutenberg_id == id) %>% .$title
  # print() is required to render a ggplot object from inside a for loop.
  print(ggplot(bing_word_counts, aes(x = reorder(word, freq), y = freq, fill = sentiment))+
    geom_bar(stat = "identity")+
    ggtitle(book_name)+
    theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                                          size = 8, hjust = 1))+
    xlab("Word") +
    ylab("Frequency") +coord_flip()+  facet_wrap(~sentiment, scales = "free_y"))
}


۵. متن داستان بینوایان را به ۲۰۰ قسمت مساوی تقسیم کنید. برای هر قسمت تعداد لغات positive و negative را حساب کنید و سپس این دو سری زمانی را در کنار هم برای مشاهده فضای احساسی داستان رسم نمایید.

# Question 5: sentiment timeline of Les Misérables split into 200 parts.
book <- LeMiserables
# Same cleaning pipeline as for Dickens: strip punctuation (keep apostrophes).
str_replace_all(book$text,"(?!')[[:punct:]]", " ") %>% str_trim() -> book_punct_free

book_punct_free %>%
  str_split("\\s+") %>%
  unlist() -> book_words_vector
# NOTE(review): appending to books_words_vec here looks like a leftover from
# the Dickens loop — the pooled vector is not used again after this point.
books_words_vec <- c(books_words_vec, book_words_vector)
book_words_vector[!(str_to_lower(book_words_vector) %in% stop_words$word)] %>% 
  str_replace_all("[[:punct:]]", " ") %>% str_trim() -> book_punct_free_2

book_punct_free_2 %>%
  str_split("\\s+") %>%
  unlist() %>%
  as.data.frame() -> book_words_df

# Total token count, used to cut the text into 200 equal parts below.
wcount <- nrow(book_words_df)

# Clean word list with a part index assigned by position in the text.
# NOTE(review): wcount is measured BEFORE the length/stop-word/digit filters
# while row_number() runs AFTER them, so the part index tops out below 200
# and parts are not exactly equal — confirm whether this is intended.
dplyr::select(book_words_df, word = `.`)%>%
  filter(str_length(word) >= 3) %>% 
  filter(!str_to_lower(word) %in% stop_words$word) %>% 
  filter(!str_detect(word, "\\d")) %>% 
  mutate(word = as.character(word)) %>% 
  mutate(part =  ceiling(200 * row_number() / wcount)) -> book_text

# Positive/negative word counts per part via the bing lexicon.
book_text %>% 
  mutate(word = tolower(word)) %>% 
  inner_join(get_sentiments("bing")) %>% 
  group_by(part) %>% 
  summarize(positive = sum(sentiment == "positive"), negative = sum(sentiment == "negative")) -> scores

# Sentiment timeline: positive counts plotted above the axis, negative below.
# Fixed: the x/y axis labels were swapped (x is the story part, y a count),
# and the legend label "postive" was misspelled.
ggplot(scores, aes(x = part)) +
  geom_col(aes(y = positive, fill = "positive")) +
  geom_col(aes(y = -negative, fill = "negative")) +
  xlab("part") +
  ylab("count") +
  ggtitle("Le Miserables storyline")


۶. ابتدا ترکیبات دوتایی کلماتی که پشت سر هم می آیند را استخراج کنید و سپس نمودار ۳۰ جفت لغت پرتکرار را رسم نمایید.

# Combine all Dickens text frames into one, tagging each row with the book
# title looked up from the metadata. Building the pieces in a list and
# rbind-ing once replaces the original O(n^2) grow-by-rbind-in-a-loop.
all_dickens <- do.call(rbind, lapply(books, function(book) {
  # First gutenberg_id identifies the book; all rows share it.
  id <- book$gutenberg_id[1]
  book_name <- book_meta %>% filter(gutenberg_id == id) %>% .$title
  book %>% mutate(book = book_name)
}))

# Tokenize every line into consecutive word pairs (bigrams).
dickens_bigrams <- all_dickens %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Split each bigram so both halves can be screened against the stop list.
bigrams_separated <- dickens_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only pairs where neither half is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

# Re-join the surviving halves into a single bigram column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
## # A tibble: 331,696 x 3
##    gutenberg_id book                 bigram           
##           <int> <chr>                <chr>            
##  1           98 A Tale of Two Cities french revolution
##  2           98 A Tale of Two Cities charles dickens  
##  3           98 A Tale of Two Cities dickens contents 
##  4           98 A Tale of Two Cities contents book    
##  5           98 A Tale of Two Cities life chapter     
##  6           98 A Tale of Two Cities period chapter   
##  7           98 A Tale of Two Cities chapter ii       
##  8           98 A Tale of Two Cities mail chapter     
##  9           98 A Tale of Two Cities chapter iii      
## 10           98 A Tale of Two Cities night shadows    
## # ... with 331,686 more rows
# Top 30 bigrams overall. The original chained two assignments on one pipe
# (`bigram_counts <- ... -> top_verbs` style), binding the same table to two
# names and silently overwriting the earlier word1/word2 bigram_counts.
# Both names are kept so downstream script state is unchanged.
top_bigrams <- bigrams_united %>% 
  count(bigram, sort = TRUE) %>% 
  head(30)
bigram_counts <- top_bigrams

# Horizontal bar chart of the 30 most frequent bigrams, ordered by count.
ggplot(top_bigrams, aes(x = reorder(bigram, n), y = n, fill = n)) +
  geom_bar(stat = "identity") +
  theme(axis.text.y = element_text(angle = 45, vjust = 1,
                                   size = 8, hjust = 1)) +
  guides(fill = "none") +
  labs(x = "Bigram",
       y = "Frequency",
       title = "Charles Dickens Novels Bigrams") +
  coord_flip()


۷. جفت کلماتی که با she و یا he آغاز می شوند را استخراج کنید. بیست فعل پرتکراری که زنان و مردان در داستان های دیکنز انجام می دهند را استخراج کنید و نمودار آن را رسم نمایید.

# Bigrams whose first word is "he" or "she": the second word approximates
# the action each pronoun performs.
bigrams_he_she <- bigrams_separated %>% filter(word1 == "he" | word1 == "she")
# Top 20 follower-words per pronoun. The original bound the same table to
# two names at once (`bigram_he_she <- ... -> top_verbs`); both names are
# kept for compatibility. An explicit ungroup() drops the now-unneeded
# grouping attribute.
top_verbs <- bigrams_he_she %>%
  count(word1, word2, sort = TRUE) %>%
  group_by(word1) %>%
  top_n(wt = n, n = 20) %>%
  ungroup()
bigram_he_she <- top_verbs

# Top words following "he"/"she", one facet per pronoun.
ggplot(top_verbs, aes(x = reorder(word2, n), y = n, fill = word1)) +
  geom_bar(stat = "identity") +
  theme(axis.text.y = element_text(angle = 45, vjust = 1,
                                   size = 8, hjust = 1)) +
  guides(fill = "none") +
  labs(x = "Verb",
       y = "Frequency",
       title = "Charles Dickens Novels Verbs") +
  coord_flip() +
  facet_wrap(~word1, scales = "free")


۸. برای کتاب های دیکنز ابتدا هر فصل را جدا کنید. سپس برای هر فصل 1-gram, 2-gram را استخراج کنید. آیا توزیع N-gram در کارهای دیکنز یکسان است؟ با رسم نمودار هم این موضوع را بررسی کنید.

تعدادی از کلمات مهم کتاب ها را و جفت کلمات مهم را جدا می کنیم. سپس برای هر دو حالت unigram و bigram یا استفاده از chi-squared-test بررسی می کنیم که ببینیم آیا توزیع کلمات در میان فصول کتاب ها برابرند یا نه. در آخر هم برای چند فصل که در آن ها کلمات مهم زیاد آمده اند نمودار فراوانی ngram ها را می کشیم.

# Split every Dickens novel into chapters: a running count of lines starting
# with "chapter" (case-insensitive) numbers the chapters; lines before the
# first chapter heading (front matter) are dropped.
by_chapter <- all_dickens %>%
  group_by(book) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0)



### Unigrams: are top single words distributed the same across chapters?

# Tokenize chapters into single words.
Dickens_unigrams <- by_chapter %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

unigrams_filtered <- Dickens_unigrams %>%
  filter(!word %in% stop_words$word)

# The 20 most frequent non-stop-word unigrams over all chapters
# (top_n may keep more than 20 rows when counts tie).
top_unigrams <- unigrams_filtered %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# Chapter-by-word contingency table: one row per (book, chapter) document,
# one column per top word; absent combinations become NA, zeroed below.
unigrams_counts <- unigrams_filtered %>% 
  filter(word %in% top_unigrams$word) %>% 
  group_by(book, chapter) %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(word, count) %>%
  unite(document, book , chapter)
unigrams_counts[is.na(unigrams_counts)] <- 0

# Chi-squared homogeneity test: p < 2.2e-16 below, so the top-word
# distribution is NOT the same across chapters.
chisq.test(as.matrix(unigrams_counts %>% select(-document)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(unigrams_counts %>% select(-document))
## X-squared = 69264, df = 15884, p-value < 2.2e-16
# Back to long format for plotting.
unigrams_counts_gathered <- unigrams_counts %>%
  gather(key = word, value = count, -document)

# The 20 chapters containing the most occurrences of the top words.
top_documents <- unigrams_counts_gathered %>% 
  group_by(document) %>% 
  summarize(count = sum(count)) %>%
  ungroup() %>% top_n(n = 20) %>% .$document

# Per-chapter frequency bars, one facet per chapter.
ggplot(unigrams_counts_gathered %>% filter(document %in% top_documents),
       aes(x = reorder(word, count), y = count, fill = document))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Unigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Unigrams In Top Dickens Chapters")+
  coord_flip()+  facet_wrap(~document, scales = "free")

####Bigrams

# Bigram distribution across Dickens chapters (same workflow as unigrams).
Dickens_bigrams <- by_chapter %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigrams_separated <- Dickens_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs containing a stop word on either side.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# Top 20 bigrams overall (ties may keep extra rows).
top_bigrams <- bigrams_united %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# Chapter-by-bigram contingency table; NA -> 0 for absent combinations.
bigrams_counts <- bigrams_united%>% 
  filter(bigram %in% top_bigrams$bigram) %>% 
  group_by(book, chapter) %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(bigram, count) %>%
  unite(document, book , chapter)
bigrams_counts[is.na(bigrams_counts)] <- 0

# Homogeneity test across chapters (rejected below).
chisq.test(as.matrix(bigrams_counts %>% select(-document)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(bigrams_counts %>% select(-document))
## X-squared = 62516, df = 11666, p-value < 2.2e-16
bigrams_counts_gathered <- bigrams_counts %>%
  gather(key = bigram, value = count, -document)

# The 20 chapters richest in the top bigrams.
top_documents <- bigrams_counts_gathered %>% 
  group_by(document) %>% 
  summarize(count = sum(count)) %>%
  ungroup() %>% top_n(n = 20) %>% .$document

ggplot(bigrams_counts_gathered %>% filter(document %in% top_documents),
       aes(x = reorder(bigram, count), y = count, fill = document))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Bigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Bigrams In Top Dickens Chapters")+
  coord_flip()+  facet_wrap(~document, scales = "free_x")

####Trigrams

# Trigram distribution across Dickens chapters (same workflow again).
Dickens_trigrams <- by_chapter %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)

trigrams_separated <- Dickens_trigrams %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ")

# Drop triples containing a stop word in any position.
trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(!word3 %in% stop_words$word)
  

trigrams_united <- trigrams_filtered %>%
  unite(trigram, word1, word2, word3, sep = " ")
# Top 20 trigrams; head(20) caps the count exactly (unlike top_n on ties).
top_trigrams <- trigrams_united %>% 
  count(trigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  arrange(desc(count)) %>% 
  head(20)

# Chapter-by-trigram contingency table; NA -> 0 for absent combinations.
trigrams_counts <- trigrams_united%>% 
  filter(trigram %in% top_trigrams$trigram) %>% 
  group_by(book, chapter) %>% 
  count(trigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(trigram, count) %>%
  unite(document, book , chapter)
trigrams_counts[is.na(trigrams_counts)] <- 0

# Homogeneity test across chapters (rejected below).
chisq.test(as.matrix(trigrams_counts %>% select(-document)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(trigrams_counts %>% select(-document))
## X-squared = 13965, df = 4294, p-value < 2.2e-16
trigrams_counts_gathered <- trigrams_counts %>%
  gather(key = trigram, value = count, -document)

# The 20 chapters richest in the top trigrams.
top_documents <- trigrams_counts_gathered %>% 
  group_by(document) %>% 
  summarize(count = sum(count)) %>%
  ungroup() %>% top_n(n = 20) %>% .$document

ggplot(trigrams_counts_gathered %>% filter(document %in% top_documents),
       aes(x = reorder(trigram, count), y = count, fill = document))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Trigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Trigrams In Top Dickens Chapters")+
  coord_flip()+  facet_wrap(~document, scales = "free_x")


۹. برای آثار ارنست همینگوی نیز تمرین ۸ را تکرار کنید. آیا بین آثار توزیع n-grams در بین آثار این دو نویسنده یکسان است؟

از آثار Jane Austen استفاده می کنیم.

برای اینکه ببینیم آیا توزیع ngram ها در بین این ۲ نویسنده برابر است یا نه از chi-squared-test برای تعداد
ngram های استفاده شده توسط هر نویسنده استفاده می کنیم.

###Jane Austen Books
library(janeaustenr)

# NOTE(review): the question asks for Hemingway, but Jane Austen's novels
# are used instead (as stated in the accompanying prose).
# Chapter-split Austen's books exactly as was done for Dickens.
all_Austin <- austen_books()
by_chapter <- all_Austin %>%
  group_by(book) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0)



###Unigrams

# Unigram distribution across Austen chapters (mirrors the Dickens analysis).
Austin_unigrams <- by_chapter %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

unigrams_filtered <- Austin_unigrams %>%
  filter(!word %in% stop_words$word)

# Top 20 non-stop-word unigrams (ties may keep more).
top_unigrams <- unigrams_filtered %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# Chapter-by-word contingency table; NA -> 0 for absent combinations.
unigrams_counts <- unigrams_filtered %>% 
  filter(word %in% top_unigrams$word) %>% 
  group_by(book, chapter) %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(word, count) %>%
  unite(document, book , chapter)
unigrams_counts[is.na(unigrams_counts)] <- 0

# Homogeneity test across chapters (rejected below).
chisq.test(as.matrix(unigrams_counts %>% select(-document)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(unigrams_counts %>% select(-document))
## X-squared = 25326, df = 5092, p-value < 2.2e-16
unigrams_counts_gathered <- unigrams_counts %>%
  gather(key = word, value = count, -document)

# Top chapters by total occurrences of the top words.
# NOTE(review): n = 19 here vs n = 20 in the Dickens version — possibly to
# fit the facet grid; confirm whether this is intentional.
top_documents <- unigrams_counts_gathered %>% 
  group_by(document) %>% 
  summarize(count = sum(count)) %>%
  ungroup() %>% top_n(n = 19) %>% .$document

ggplot(unigrams_counts_gathered %>% filter(document %in% top_documents),
       aes(x = reorder(word, count), y = count, fill = document))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Unigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Unigrams In Top Austin Chapters")+
  coord_flip()+  facet_wrap(~document, scales = "free")

####Bigrams

# Bigram distribution across Austen chapters (mirrors the Dickens analysis).
Austin_bigrams <- by_chapter %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigrams_separated <- Austin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs containing a stop word on either side.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# Top 20 bigrams (ties may keep more).
top_bigrams <- bigrams_united %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# Chapter-by-bigram contingency table; NA -> 0 for absent combinations.
bigrams_counts <- bigrams_united%>% 
  filter(bigram %in% top_bigrams$bigram) %>% 
  group_by(book, chapter) %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(bigram, count) %>%
  unite(document, book , chapter)
bigrams_counts[is.na(bigrams_counts)] <- 0

# Homogeneity test across chapters (rejected below).
chisq.test(as.matrix(bigrams_counts %>% select(-document)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(bigrams_counts %>% select(-document))
## X-squared = 23167, df = 4598, p-value < 2.2e-16
bigrams_counts_gathered <- bigrams_counts %>%
  gather(key = bigram, value = count, -document)

# Top chapters by total occurrences of the top bigrams (n = 19, as above).
top_documents <- bigrams_counts_gathered %>% 
  group_by(document) %>% 
  summarize(count = sum(count)) %>%
  ungroup() %>% top_n(n = 19) %>% .$document

# Bigram frequencies for the top Austen chapters, one facet per chapter.
ggplot(bigrams_counts_gathered %>% filter(document %in% top_documents),
       aes(x = reorder(bigram, count), y = count, fill = document))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Bigram") +   # fixed: was misspelled "Bxigram"
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Bigrams In Top Austin Chapters")+
  coord_flip()+  facet_wrap(~document, scales = "free_x")

###### versus

# Stack both authors' texts with a writer tag; gutenberg_id is dropped from
# the Dickens side so the column sets match for rbind.
together <- rbind(all_Austin %>% mutate(writer = "Austin"),
                  all_dickens %>% mutate(writer = "Dickens") %>% select(-gutenberg_id))


###Unigrams

# Author-vs-author comparison: unigram usage rates.
Versus_unigrams <- together %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

unigrams_filtered <- Versus_unigrams %>%
  filter(!word %in% stop_words$word)

# Top 20 unigrams across both authors combined.
top_unigrams <- unigrams_filtered %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# 2-row (one per writer) by top-word contingency table.
unigrams_counts <- unigrams_filtered %>% 
  filter(word %in% top_unigrams$word) %>% 
  group_by(writer) %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(word, count) %>% 
  ungroup()

unigrams_counts[is.na(unigrams_counts)] <- 0
# Homogeneity test: do the two authors use the top words at the same rates?
# (Rejected below: the distributions differ.)
chisq.test(as.matrix(unigrams_counts %>% select(-writer)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(unigrams_counts %>% select(-writer))
## X-squared = 3295.8, df = 19, p-value < 2.2e-16
unigrams_counts_gathered <- unigrams_counts %>%
  gather(key = word, value = count, -writer)

# Side-by-side frequency bars, one facet per author.
ggplot(unigrams_counts_gathered,
       aes(x = reorder(word, count), y = count, fill = writer))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Unigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Unigrams")+
  coord_flip()+  facet_wrap(~writer, scales = "free")

####Bigrams

# Author-vs-author comparison: bigram usage rates.
Versus_bigrams <- together %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigrams_separated <- Versus_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs containing a stop word on either side.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# Top 50 bigrams across both authors combined.
top_bigrams <- bigrams_united %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 50)

# 2-row (one per writer) by top-bigram contingency table; NA -> 0.
bigrams_counts <- bigrams_united%>% 
  filter(bigram %in% top_bigrams$bigram) %>% 
  group_by(writer) %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(bigram, count) %>%
  ungroup()
bigrams_counts[is.na(bigrams_counts)] <- 0

# Homogeneity test between the two authors (rejected below).
chisq.test(as.matrix(bigrams_counts %>% select(-writer)))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(bigrams_counts %>% select(-writer))
## X-squared = 7205.8, df = 49, p-value < 2.2e-16
bigrams_counts_gathered <- bigrams_counts %>%
  gather(key = bigram, value = count, -writer)

ggplot(bigrams_counts_gathered,
       aes(x = reorder(bigram, count), y = count, fill = writer))+
  geom_bar(stat = "identity")+
  theme(axis.text.y = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1))+
  xlab("Bigram") +
  guides(fill = "none")+
  ylab("Frequency") +
  ggtitle("Bigrams")+
  coord_flip()+  facet_wrap(~writer, scales = "free_x")


۱۰. بر اساس دادهایی که در تمرین ۸ و ۹ از آثار دو نویسنده به دست آوردید و با استفاده از N-gram ها یک مدل لاجستیک برای تشخیص صاحب اثر بسازید. خطای مدل چقدر است؟ برای یادگیری مدل از کتاب کتاب الیور تویست اثر دیکنز و کتاب پیرمرد و دریا استفاده نکنید. پس از ساختن مدل برای تست کردن فصل های این کتابها را به عنوان داده ورودی به مدل بدهید. خطای تشخیص چقدر است؟

# Question 10: author classification. Chapter-split both corpora and tag
# each line with its writer; the two frames are stacked for modeling.
austin_text <- all_Austin %>%
  group_by(book) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  mutate(writer = "Austin")

dickens_text <- all_dickens %>%
  group_by(book) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  mutate(writer = "Dickens") %>% 
  # Drop gutenberg_id so columns align with the Austen frame for rbind.
  select(-gutenberg_id)

together_text <- rbind(austin_text, dickens_text)

######### We will use Oliver Twist and Sense and Sensibility books for our test data

## Train Data — hold out one book per author (Oliver Twist, Sense &
## Sensibility) for testing; everything else is used for training.
train_text <- together_text %>% filter(book != "Sense & Sensibility", book !="Oliver Twist")

### Unigrams: per-chapter counts of the 20 most frequent training unigrams.
train_unigrams <- train_text %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

train_unigrams_filtered <- train_unigrams %>%
  filter(!word %in% stop_words$word)

train_top_unigrams <- train_unigrams_filtered %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 20)

# One row per (book, chapter, writer) document, one column per top unigram;
# absent combinations become NA and are zeroed below.
train_unigrams_counts <- train_unigrams_filtered %>% 
  filter(word %in% train_top_unigrams$word) %>% 
  group_by(book, chapter, writer) %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(word, count) %>%
  ungroup() %>% 
  unite(document, book, chapter, writer)
train_unigrams_counts[is.na(train_unigrams_counts)] <- 0

### Bigrams: per-chapter counts of the 10 most frequent training bigrams.

train_bigrams <- train_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

train_bigrams_separated <- train_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs containing a stop word on either side.
train_bigrams_filtered <- train_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

train_bigrams_united <- train_bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

train_top_bigrams <- train_bigrams_united %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  top_n(wt = count, n = 10)

# One row per (book, chapter, writer) document, one column per top bigram.
train_bigrams_counts <- train_bigrams_united%>% 
  filter(bigram %in% train_top_bigrams$bigram) %>% 
  group_by(book, chapter, writer) %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(bigram, count) %>%
  ungroup() %>% 
  unite(document, book, chapter, writer)

train_bigrams_counts[is.na(train_bigrams_counts)] <- 0

# Join unigram and bigram count tables on the document key; chapters missing
# from either side pick up NA counts, zeroed immediately.
train_data <- full_join(train_unigrams_counts, train_bigrams_counts, by = "document")
train_data[is.na(train_data)] <- 0
# Recover book/chapter/writer from the key and encode the target:
# 1 = Dickens, 0 = Austen.
# NOTE(review): separating on "_" assumes no book title contains an
# underscore — confirm.
train_data <- train_data %>%
  separate(document, c("book", "chapter", "writer"), sep = "_") %>% 
  mutate(writer =1 * (writer == ("Dickens")))

#### Model: logistic regression of writer on n-gram counts. Predictors that
# trivially identify the author (honorifics like "sir"/"miss", character
# names such as "miss havisham") are dropped by hand, along with book and
# chapter identifiers.
model <- glm(
  writer~.,
  data = train_data %>% select(-book,-chapter,-house,-looked,-`miss tox`,
                               -`miss havisham`,-`captain cuttle`,-`sir leicester`,
                                -`sir mulberry`,-`sir returned`,-mind,-dear,
                               -`sir replied`,-replied,-`dear sir`,-sir,-door,-`sir thomas`,-eyes),
  family = binomial)
summary(model)
## 
## Call:
## glm(formula = writer ~ ., family = binomial, data = train_data %>% 
##     select(-book, -chapter, -house, -looked, -`miss tox`, -`miss havisham`, 
##         -`captain cuttle`, -`sir leicester`, -`sir mulberry`, 
##         -`sir returned`, -mind, -dear, -`sir replied`, -replied, 
##         -`dear sir`, -sir, -door, -`sir thomas`, -eyes))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.78256   0.00000   0.00123   0.03199   2.64175  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.60519    0.40552  -6.424 1.32e-10 ***
## cried       -0.34607    0.08851  -3.910 9.22e-05 ***
## day         -0.16580    0.08298  -1.998 0.045721 *  
## friend      -0.27648    0.07793  -3.548 0.000388 ***
## gentleman    0.43335    0.11197   3.870 0.000109 ***
## hand         0.70101    0.12391   5.658 1.54e-08 ***
## head         0.89726    0.14105   6.361 2.00e-10 ***
## lady        -0.11100    0.03967  -2.798 0.005145 ** 
## life         0.52084    0.13565   3.839 0.000123 ***
## miss        -0.06301    0.01772  -3.556 0.000376 ***
## night        0.85695    0.13855   6.185 6.20e-10 ***
## returned     0.58906    0.13534   4.352 1.35e-05 ***
## time        -0.22156    0.06533  -3.392 0.000695 ***
## `ha ha`      1.39594    0.60411   2.311 0.020846 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1052.76  on 1002  degrees of freedom
## Residual deviance:  216.22  on  989  degrees of freedom
## AIC: 244.22
## 
## Number of Fisher Scoring iterations: 10
## Test Data
# Held-out chapters: one book per author, never seen during training.
test_text <- together_text %>% filter(book == "Sense & Sensibility" | book =="Oliver Twist")

### Unigrams: counts restricted to the TRAINING top words so the test
### feature columns match the model's.
test_unigrams <- test_text %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

test_unigrams_filtered <- test_unigrams %>%
  filter(!word %in% stop_words$word)

test_unigrams_counts <- test_unigrams_filtered %>% 
  filter(word %in% train_top_unigrams$word) %>% 
  group_by(book, chapter, writer) %>% 
  count(word, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(word, count) %>%
  ungroup() %>% 
  unite(document, book, chapter, writer)
test_unigrams_counts[is.na(test_unigrams_counts)] <- 0

### Bigrams

# Bigram features for the test set, restricted to the TRAINING top bigrams.
test_bigrams <- test_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

test_bigrams_separated <- test_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop pairs containing a stop word on either side.
test_bigrams_filtered <- test_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

test_bigrams_united <- test_bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

test_bigrams_counts <- test_bigrams_united%>% 
  filter(bigram %in% train_top_bigrams$bigram) %>% 
  group_by(book, chapter, writer) %>% 
  count(bigram, sort = TRUE) %>% 
  rename(count = n) %>% 
  spread(bigram, count) %>%
  ungroup() %>% 
  unite(document, book, chapter, writer)

test_bigrams_counts[is.na(test_bigrams_counts)] <- 0

# Assemble the test design matrix exactly like the training one:
# join unigram/bigram counts, zero the gaps, recover the id columns,
# and encode the target (1 = Dickens, 0 = Austen).
test_data <- full_join(test_unigrams_counts, test_bigrams_counts, by = "document")
test_data[is.na(test_data)] <- 0
test_data <- test_data %>%  
  separate(document, c("book", "chapter", "writer"), sep = "_") %>% 
  mutate(writer = 1 * (writer == ("Dickens")))

# Score the training data with the fitted model (predicted P(Dickens)).
train_data$prediction <- predict(model, newdata = train_data, type = "response")

# Setting cutoff: ROCInfo picks the cost-minimizing probability threshold
# (equal false-positive/false-negative costs here).
cost_fp <- 100; cost_fn <- 100
roc_info = ROCInfo( data = train_data, predict = "prediction", 
                    actual = "writer", cost.fp = cost_fp, cost.fn = cost_fn )
co <- roc_info$cutoff

# Confusion counts on the training data.
# BUG FIX: P previously used a hard-coded 0.5 threshold while N/TP/TN/FP/FN
# all used the ROC cutoff `co`, making the accuracy denominator (P + N)
# inconsistent with the numerator; every count now uses `co`.
P  <- train_data %>% filter(prediction > co) %>% count()
N  <- train_data %>% filter(prediction <= co) %>% count()
TP <- train_data %>% filter(prediction > co, writer == 1) %>% count()
TN <- train_data %>% filter(prediction <= co, writer == 0) %>% count()
FP <- train_data %>% filter(prediction > co, writer == 0) %>% count()
FN <- train_data %>% filter(prediction <= co, writer == 1) %>% count()
ACC <- (TP + TN) / (P + N)
sprintf("Accuracy (ACC): %.3f", ACC)
## [1] "Accuracy (ACC): 0.962"
sprintf("Error (ERR): %.3f%%", (1 - ACC)* 100)
## [1] "Error (ERR): 3.808%"
## (recorded output above is from the original run; values may shift
##  slightly now that P uses the ROC cutoff)
# Score the held-out chapters (Oliver Twist, Sense & Sensibility).
test_data$prediction = predict(model, newdata = test_data , type = "response")
# Confusion-matrix visualization at the training-derived cutoff `co`.
cm_info = ConfusionMatrixInfo( data = test_data, predict = "prediction", 
                               actual = "writer", cutoff = co)

cm_info$plot

# Confusion counts and accuracy on the test set, all at the same cutoff.
P <- test_data %>% filter(prediction > co) %>% count()
N <- test_data %>% filter(prediction <= co) %>% count()
TP <- test_data %>% filter(prediction > co,writer == 1) %>% count()
TN <- test_data %>% filter(prediction <= co, writer == 0) %>% count()
FP <- test_data %>% filter(prediction > co,writer== 0) %>% count()
FN <- test_data %>% filter(prediction <= co, writer == 1) %>% count()
ACC <- (TP + TN)/ (P + N)
sprintf("Accuracy (ACC): %.3f", ACC)
## [1] "Accuracy (ACC): 0.922"
sprintf("Error (ERR): %.3f%%", (1 - ACC)* 100)
## [1] "Error (ERR): 7.767%"